import pandas as pd
import requests
import os
import json
import numpy as np
WeRateDogs Twitter archive data
archive = pd.read_csv('twitter-archive-enhanced.csv')
Tweet image predictions
# Download the image-predictions file hosted on Udacity's server.
# BUG FIX: the URL previously began with a stray space, which is not a valid
# URL character (requests happens to tolerate it, but it should not be there).
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
response.raise_for_status()  # fail loudly instead of silently saving an error page
response
<Response [200]>
folder_name = 'Image_predictions'
# exist_ok avoids the race between the existence check and the mkdir
os.makedirs(folder_name, exist_ok=True)
# Save the downloaded bytes under the file's original name
file_path = os.path.join(folder_name, url.split('/')[-1])
with open(file_path, mode='wb') as file:
    file.write(response.content)
# BUG FIX: read the file from the folder it was just saved to; the original
# read 'image-predictions.tsv' from the working directory, which only worked
# if a separate copy of the file already happened to be there.
Image_predictions = pd.read_csv(file_path, sep='\t')
Additional data from the Twitter API
#import tweepy
#from tweepy import OAuthHandler
#from timeit import default_timer as timer
# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden to comply with Twitter's API terms and conditions
#consumer_key = 'HIDDEN'
#consumer_secret = 'HIDDEN'
#access_token = 'HIDDEN'
#access_secret = 'HIDDEN'
#auth = OAuthHandler(consumer_key, consumer_secret)
#auth.set_access_token(access_token, access_secret)
#api = tweepy.API(auth, wait_on_rate_limit=True)
# NOTE TO STUDENT WITH MOBILE VERIFICATION ISSUES:
# df_1 is a DataFrame with the twitter_archive_enhanced.csv file. You may have to
# change line 17 to match the name of your DataFrame with twitter_archive_enhanced.csv
# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor
# Tweet IDs for which to gather additional data via Twitter's API
#archive = pd.read_csv('twitter-archive-enhanced.csv')
#tweet_ids = archive.tweet_id.values
#len(tweet_ids)
# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
#count = 0
#fails_dict = {}
#start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
#with open('tweet_json.txt', 'w') as outfile:
# This loop will likely take 20-30 minutes to run because of Twitter's rate limit
#for tweet_id in tweet_ids:
#count += 1
#print(str(count) + ": " + str(tweet_id))
#try:
#tweet = api.get_status(tweet_id, tweet_mode='extended')
#print("Success")
#json.dump(tweet._json, outfile)
#outfile.write('\n')
#except tweepy.TweepError as e:
#print("Fail")
#fails_dict[tweet_id] = e
#pass
#end = timer()
#print(end - start)
#print(fails_dict)
# Build a list of dicts (one per tweet line) and convert to a DataFrame once
# at the end — appending to a DataFrame row by row would be quadratic.
df_list = []
with open('tweet-json.txt', 'r') as file:
    for line in file:
        data = json.loads(line)
        # Extract only the fields that end up in the DataFrame; the original
        # also pulled created_at / retweeted and then silently discarded them
        # via the columns= selection below.
        df_list.append({'tweet_id': data['id'],
                        'favorite_count': data['favorite_count'],
                        'retweet_count': data['retweet_count']})
tweet_json = pd.DataFrame(df_list, columns=['tweet_id', 'favorite_count', 'retweet_count'])
archive
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 892420643555336193 | NaN | NaN | 2017-08-01 16:23:56 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Phineas. He's a mystical boy. Only eve... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892420643... | 13 | 10 | Phineas | None | None | None | None |
| 1 | 892177421306343426 | NaN | NaN | 2017-08-01 00:17:27 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Tilly. She's just checking pup on you.... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892177421... | 13 | 10 | Tilly | None | None | None | None |
| 2 | 891815181378084864 | NaN | NaN | 2017-07-31 00:18:03 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Archie. He is a rare Norwegian Pouncin... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891815181... | 12 | 10 | Archie | None | None | None | None |
| 3 | 891689557279858688 | NaN | NaN | 2017-07-30 15:58:51 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Darla. She commenced a snooze mid meal... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891689557... | 13 | 10 | Darla | None | None | None | None |
| 4 | 891327558926688256 | NaN | NaN | 2017-07-29 16:00:24 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Franklin. He would like you to stop ca... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891327558... | 12 | 10 | Franklin | None | None | None | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2351 | 666049248165822465 | NaN | NaN | 2015-11-16 00:24:50 +0000 | <a href="http://twitter.com/download/iphone" r... | Here we have a 1949 1st generation vulpix. Enj... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666049248... | 5 | 10 | None | None | None | None | None |
| 2352 | 666044226329800704 | NaN | NaN | 2015-11-16 00:04:52 +0000 | <a href="http://twitter.com/download/iphone" r... | This is a purebred Piers Morgan. Loves to Netf... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666044226... | 6 | 10 | a | None | None | None | None |
| 2353 | 666033412701032449 | NaN | NaN | 2015-11-15 23:21:54 +0000 | <a href="http://twitter.com/download/iphone" r... | Here is a very happy pup. Big fan of well-main... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666033412... | 9 | 10 | a | None | None | None | None |
| 2354 | 666029285002620928 | NaN | NaN | 2015-11-15 23:05:30 +0000 | <a href="http://twitter.com/download/iphone" r... | This is a western brown Mitsubishi terrier. Up... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666029285... | 7 | 10 | a | None | None | None | None |
| 2355 | 666020888022790149 | NaN | NaN | 2015-11-15 22:32:08 +0000 | <a href="http://twitter.com/download/iphone" r... | Here we have a Japanese Irish Setter. Lost eye... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666020888... | 8 | 10 | None | None | None | None | None |
2356 rows × 17 columns
Image_predictions
| tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 666020888022790149 | https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg | 1 | Welsh_springer_spaniel | 0.465074 | True | collie | 0.156665 | True | Shetland_sheepdog | 0.061428 | True |
| 1 | 666029285002620928 | https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg | 1 | redbone | 0.506826 | True | miniature_pinscher | 0.074192 | True | Rhodesian_ridgeback | 0.072010 | True |
| 2 | 666033412701032449 | https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg | 1 | German_shepherd | 0.596461 | True | malinois | 0.138584 | True | bloodhound | 0.116197 | True |
| 3 | 666044226329800704 | https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg | 1 | Rhodesian_ridgeback | 0.408143 | True | redbone | 0.360687 | True | miniature_pinscher | 0.222752 | True |
| 4 | 666049248165822465 | https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg | 1 | miniature_pinscher | 0.560311 | True | Rottweiler | 0.243682 | True | Doberman | 0.154629 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2070 | 891327558926688256 | https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg | 2 | basset | 0.555712 | True | English_springer | 0.225770 | True | German_short-haired_pointer | 0.175219 | True |
| 2071 | 891689557279858688 | https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg | 1 | paper_towel | 0.170278 | False | Labrador_retriever | 0.168086 | True | spatula | 0.040836 | False |
| 2072 | 891815181378084864 | https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg | 1 | Chihuahua | 0.716012 | True | malamute | 0.078253 | True | kelpie | 0.031379 | True |
| 2073 | 892177421306343426 | https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg | 1 | Chihuahua | 0.323581 | True | Pekinese | 0.090647 | True | papillon | 0.068957 | True |
| 2074 | 892420643555336193 | https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg | 1 | orange | 0.097049 | False | bagel | 0.085851 | False | banana | 0.076110 | False |
2075 rows × 12 columns
tweet_json
| tweet_id | favorite_count | retweet_count | |
|---|---|---|---|
| 0 | 892420643555336193 | 39467 | 8853 |
| 1 | 892177421306343426 | 33819 | 6514 |
| 2 | 891815181378084864 | 25461 | 4328 |
| 3 | 891689557279858688 | 42908 | 8964 |
| 4 | 891327558926688256 | 41048 | 9774 |
| ... | ... | ... | ... |
| 2349 | 666049248165822465 | 111 | 41 |
| 2350 | 666044226329800704 | 311 | 147 |
| 2351 | 666033412701032449 | 128 | 47 |
| 2352 | 666029285002620928 | 132 | 48 |
| 2353 | 666020888022790149 | 2535 | 532 |
2354 rows × 3 columns
archive.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2356 entries, 0 to 2355 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2356 non-null int64 1 in_reply_to_status_id 78 non-null float64 2 in_reply_to_user_id 78 non-null float64 3 timestamp 2356 non-null object 4 source 2356 non-null object 5 text 2356 non-null object 6 retweeted_status_id 181 non-null float64 7 retweeted_status_user_id 181 non-null float64 8 retweeted_status_timestamp 181 non-null object 9 expanded_urls 2297 non-null object 10 rating_numerator 2356 non-null int64 11 rating_denominator 2356 non-null int64 12 name 2356 non-null object 13 doggo 2356 non-null object 14 floofer 2356 non-null object 15 pupper 2356 non-null object 16 puppo 2356 non-null object dtypes: float64(4), int64(3), object(10) memory usage: 313.0+ KB
Image_predictions.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2075 entries, 0 to 2074 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2075 non-null int64 1 jpg_url 2075 non-null object 2 img_num 2075 non-null int64 3 p1 2075 non-null object 4 p1_conf 2075 non-null float64 5 p1_dog 2075 non-null bool 6 p2 2075 non-null object 7 p2_conf 2075 non-null float64 8 p2_dog 2075 non-null bool 9 p3 2075 non-null object 10 p3_conf 2075 non-null float64 11 p3_dog 2075 non-null bool dtypes: bool(3), float64(3), int64(2), object(4) memory usage: 152.1+ KB
tweet_json.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2354 entries, 0 to 2353 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2354 non-null int64 1 favorite_count 2354 non-null int64 2 retweet_count 2354 non-null int64 dtypes: int64(3) memory usage: 55.3 KB
archive.tweet_id.duplicated().sum()
0
archive['in_reply_to_status_id'].notnull().sum()
78
archive['retweeted_status_id'].notnull().sum()
181
archive.name.value_counts().head(60)
None 745 a 55 Charlie 12 Cooper 11 Lucy 11 Oliver 11 Tucker 10 Penny 10 Lola 10 Winston 9 Bo 9 Sadie 8 the 8 Daisy 7 Buddy 7 Toby 7 an 7 Bailey 7 Leo 6 Oscar 6 Stanley 6 Rusty 6 Bella 6 Dave 6 Jack 6 Koda 6 Scout 6 Jax 6 Milo 6 Louis 5 Phil 5 Sammy 5 Gus 5 Chester 5 Alfie 5 Sunny 5 Bentley 5 very 5 Oakley 5 Finn 5 George 5 Larry 5 Clarence 4 Loki 4 Walter 4 Reggie 4 Hank 4 Scooter 4 Duke 4 quite 4 Reginald 4 Maggie 4 Luna 4 Maddie 4 Dexter 4 Jerry 4 Carl 4 Ruby 4 Riley 4 Clark 4 Name: name, dtype: int64
archive.rating_numerator.describe()
count 2356.000000 mean 13.126486 std 45.876648 min 0.000000 25% 10.000000 50% 11.000000 75% 12.000000 max 1776.000000 Name: rating_numerator, dtype: float64
archive.rating_numerator.unique()
array([ 13, 12, 14, 5, 17, 11, 10, 420, 666, 6, 15,
182, 960, 0, 75, 7, 84, 9, 24, 8, 1, 27,
3, 4, 165, 1776, 204, 50, 99, 80, 45, 60, 44,
143, 121, 20, 26, 2, 144, 88], dtype=int64)
archive.rating_denominator.describe()
count 2356.000000 mean 10.455433 std 6.745237 min 0.000000 25% 10.000000 50% 10.000000 75% 10.000000 max 170.000000 Name: rating_denominator, dtype: float64
archive.rating_denominator.unique()
array([ 10, 0, 15, 70, 7, 11, 150, 170, 20, 50, 90, 80, 40,
130, 110, 16, 120, 2], dtype=int64)
Image_predictions.describe()
| tweet_id | img_num | p1_conf | p2_conf | p3_conf | |
|---|---|---|---|---|---|
| count | 2.075000e+03 | 2075.000000 | 2075.000000 | 2.075000e+03 | 2.075000e+03 |
| mean | 7.384514e+17 | 1.203855 | 0.594548 | 1.345886e-01 | 6.032417e-02 |
| std | 6.785203e+16 | 0.561875 | 0.271174 | 1.006657e-01 | 5.090593e-02 |
| min | 6.660209e+17 | 1.000000 | 0.044333 | 1.011300e-08 | 1.740170e-10 |
| 25% | 6.764835e+17 | 1.000000 | 0.364412 | 5.388625e-02 | 1.622240e-02 |
| 50% | 7.119988e+17 | 1.000000 | 0.588230 | 1.181810e-01 | 4.944380e-02 |
| 75% | 7.932034e+17 | 1.000000 | 0.843855 | 1.955655e-01 | 9.180755e-02 |
| max | 8.924206e+17 | 4.000000 | 1.000000 | 4.880140e-01 | 2.734190e-01 |
tweet_json.describe()
| tweet_id | favorite_count | retweet_count | |
|---|---|---|---|
| count | 2.354000e+03 | 2354.000000 | 2354.000000 |
| mean | 7.426978e+17 | 8080.968564 | 3164.797366 |
| std | 6.852812e+16 | 11814.771334 | 5284.770364 |
| min | 6.660209e+17 | 0.000000 | 0.000000 |
| 25% | 6.783975e+17 | 1415.000000 | 624.500000 |
| 50% | 7.194596e+17 | 3603.500000 | 1473.500000 |
| 75% | 7.993058e+17 | 10122.250000 | 3652.000000 |
| max | 8.924206e+17 | 132810.000000 | 79515.000000 |
Cleaning: archive table and Image_predictions table
# Make copies of original pieces of data
# Work on copies so the raw, gathered tables stay untouched.
archive_clean = archive.copy()
Image_predictions_clean = Image_predictions.copy()
tweet_json_clean = tweet_json.copy()
# Keep only original tweets: filter out replies and retweets
# (rows where the reply/retweet id columns are populated).
archive_clean = archive_clean[archive_clean['in_reply_to_status_id'].isnull()]
archive_clean = archive_clean[archive_clean['retweeted_status_id'].isnull()]
archive_clean['in_reply_to_status_id'].notnull().sum()
0
archive_clean['retweeted_status_id'].notnull().sum()
0
# Reply/retweet metadata and 'source' carry no value now that replies and
# retweets themselves have been removed.
archive_clean = archive_clean.drop(columns=[
    'in_reply_to_status_id', 'in_reply_to_user_id', 'source',
    'retweeted_status_id', 'retweeted_status_user_id',
    'retweeted_status_timestamp',
])
archive_clean.shape
(2097, 11)
# tweet_id is an identifier, not a quantity — store it as a string in all
# three tables so they share a dtype for the merge later on.
for table in (archive_clean, Image_predictions_clean, tweet_json_clean):
    table['tweet_id'] = table['tweet_id'].astype(str)
archive_clean.tweet_id.dtypes, Image_predictions_clean.tweet_id.dtypes , tweet_json_clean.tweet_id.dtypes
(dtype('O'), dtype('O'), dtype('O'))
# Parse the timestamp text into a proper (timezone-aware) datetime, then give
# the column a clearer name.
archive_clean['timestamp'] = pd.to_datetime(archive_clean['timestamp'])
archive_clean = archive_clean.rename(columns={'timestamp': 'tweet_date'})
archive_clean.tweet_date.dtypes
datetime64[ns, UTC]
# Each of the four stage columns holds either the stage name or the literal
# string 'None'; concatenating them gives one combined stage string per tweet.
archive_clean['dog_stages'] = (archive_clean['doggo']
                               + archive_clean['floofer']
                               + archive_clean['pupper']
                               + archive_clean['puppo'])
# Strip the 'None' placeholders, leaving only real stage names (or '').
archive_clean['dog_stages'] = archive_clean['dog_stages'].str.replace('None', '')
archive_clean.dog_stages.value_counts()
1761 pupper 221 doggo 72 puppo 23 floofer 9 doggopupper 9 doggopuppo 1 doggofloofer 1 Name: dog_stages, dtype: int64
# The combined dog_stages column supersedes the four separate stage columns.
archive_clean = archive_clean.drop(columns=['doggo', 'floofer', 'pupper', 'puppo'])
archive_clean[archive_clean['dog_stages']=='doggopupper']
| tweet_id | tweet_date | text | expanded_urls | rating_numerator | rating_denominator | name | dog_stages | |
|---|---|---|---|---|---|---|---|---|
| 460 | 817777686764523521 | 2017-01-07 16:59:28+00:00 | This is Dido. She's playing the lead role in "... | https://twitter.com/dog_rates/status/817777686... | 13 | 10 | Dido | doggopupper |
| 531 | 808106460588765185 | 2016-12-12 00:29:28+00:00 | Here we have Burke (pupper) and Dexter (doggo)... | https://twitter.com/dog_rates/status/808106460... | 12 | 10 | None | doggopupper |
| 575 | 801115127852503040 | 2016-11-22 17:28:25+00:00 | This is Bones. He's being haunted by another d... | https://twitter.com/dog_rates/status/801115127... | 12 | 10 | Bones | doggopupper |
| 705 | 785639753186217984 | 2016-10-11 00:34:48+00:00 | This is Pinot. He's a sophisticated doggo. You... | https://twitter.com/dog_rates/status/785639753... | 10 | 10 | Pinot | doggopupper |
| 733 | 781308096455073793 | 2016-09-29 01:42:20+00:00 | Pupper butt 1, Doggo 0. Both 12/10 https://t.c... | https://vine.co/v/5rgu2Law2ut | 12 | 10 | None | doggopupper |
| 889 | 759793422261743616 | 2016-07-31 16:50:42+00:00 | Meet Maggie & Lila. Maggie is the doggo, L... | https://twitter.com/dog_rates/status/759793422... | 12 | 10 | Maggie | doggopupper |
| 956 | 751583847268179968 | 2016-07-09 01:08:47+00:00 | Please stop sending it pictures that don't eve... | https://twitter.com/dog_rates/status/751583847... | 5 | 10 | None | doggopupper |
| 1063 | 741067306818797568 | 2016-06-10 00:39:48+00:00 | This is just downright precious af. 12/10 for ... | https://twitter.com/dog_rates/status/741067306... | 12 | 10 | just | doggopupper |
| 1113 | 733109485275860992 | 2016-05-19 01:38:16+00:00 | Like father (doggo), like son (pupper). Both 1... | https://twitter.com/dog_rates/status/733109485... | 12 | 10 | None | doggopupper |
# Visual assessment of the tweet image: Pinot (index 705) is not actually a
# dog, so the row is removed; the follow-up query confirms it is gone.
archive_clean.drop([705], inplace=True)
archive_clean[archive_clean.name=='Pinot']
| tweet_id | tweet_date | text | expanded_urls | rating_numerator | rating_denominator | name | dog_stages |
|---|
# Visual assessment: Dido (index 460) is a pupper, not 'doggopupper' — fix the
# combined stage by hand and display the row to verify.
archive_clean.at[460,'dog_stages']='pupper'
archive_clean.loc[460, : ]
tweet_id 817777686764523521 tweet_date 2017-01-07 16:59:28+00:00 text This is Dido. She's playing the lead role in "... expanded_urls https://twitter.com/dog_rates/status/817777686... rating_numerator 13 rating_denominator 10 name Dido dog_stages pupper Name: 460, dtype: object
# Visual assessment: Bones (index 575) is a pupper, not 'doggopupper' — fix the
# combined stage by hand and display the row to verify.
archive_clean.at[575,'dog_stages']='pupper'
archive_clean.loc[575, : ]
tweet_id 801115127852503040 tweet_date 2016-11-22 17:28:25+00:00 text This is Bones. He's being haunted by another d... expanded_urls https://twitter.com/dog_rates/status/801115127... rating_numerator 12 rating_denominator 10 name Bones dog_stages pupper Name: 575, dtype: object
archive_clean[archive_clean['dog_stages']=='doggopuppo']
| tweet_id | tweet_date | text | expanded_urls | rating_numerator | rating_denominator | name | dog_stages | |
|---|---|---|---|---|---|---|---|---|
| 191 | 855851453814013952 | 2017-04-22 18:31:02+00:00 | Here's a puppo participating in the #ScienceMa... | https://twitter.com/dog_rates/status/855851453... | 13 | 10 | None | doggopuppo |
# Visual assessment: the single 'doggopuppo' tweet (index 191) shows a puppo —
# fix the combined stage by hand and display the row to verify.
archive_clean.at[191,'dog_stages']='puppo'
archive_clean.loc[191, : ]
tweet_id 855851453814013952 tweet_date 2017-04-22 18:31:02+00:00 text Here's a puppo participating in the #ScienceMa... expanded_urls https://twitter.com/dog_rates/status/855851453... rating_numerator 13 rating_denominator 10 name None dog_stages puppo Name: 191, dtype: object
archive_clean[archive_clean['dog_stages']=='doggofloofer']
| tweet_id | tweet_date | text | expanded_urls | rating_numerator | rating_denominator | name | dog_stages | |
|---|---|---|---|---|---|---|---|---|
| 200 | 854010172552949760 | 2017-04-17 16:34:26+00:00 | At first I thought this was a shy doggo, but i... | https://twitter.com/dog_rates/status/854010172... | 11 | 10 | None | doggofloofer |
# Visual assessment: the single 'doggofloofer' tweet (index 200) shows a
# floofer — fix the combined stage by hand and display the row to verify.
archive_clean.at[200,'dog_stages']='floofer'
archive_clean.loc[200, : ]
tweet_id 854010172552949760 tweet_date 2017-04-17 16:34:26+00:00 text At first I thought this was a shy doggo, but i... expanded_urls https://twitter.com/dog_rates/status/854010172... rating_numerator 11 rating_denominator 10 name None dog_stages floofer Name: 200, dtype: object
# Visual assessment showed the remaining 'doggopupper' tweets genuinely
# contain two dogs — keep both stages, separated by a comma.
two_dogs = archive_clean['dog_stages'] == 'doggopupper'
archive_clean.loc[two_dogs, 'dog_stages'] = 'doggo,pupper'
archive_clean.dog_stages.value_counts()
1761 pupper 223 doggo 72 puppo 24 floofer 10 doggo,pupper 6 Name: dog_stages, dtype: int64
# Lowercase "names" ('a', 'the', 'very', ...) are extraction artifacts from
# the original archive, not real dog names — replace them with NaN.
archive_clean.name.replace(regex=r'(^[a-z]+)',value=np.nan, inplace=True )
# Extract the dog's name from text of the form "This is <Name>"
archive_clean['name_after_Thisis'] = archive_clean['text'].str.extract('This\sis\s([a-zA-Z]+)', expand=True)
# Discard lowercase (invalid) matches from the "This is ..." extraction too
archive_clean.name_after_Thisis.replace(regex=r'(^[a-z]+)',value=np.nan, inplace=True )
# Extract the dog's name from text of the form "named <Name>."
archive_clean['name_after_named'] = archive_clean['text'].str.extract('named ([A-Za-z]+)\.', expand=True)
# Combine the two extractions into a single Names column
# (assumes a tweet matches at most one of the two patterns — TODO confirm)
archive_clean['Names'] =archive_clean['name_after_Thisis'].fillna('') + archive_clean['name_after_named'].fillna('')
# Drop the existing name column and the two intermediate extraction columns
archive_clean.drop(['name','name_after_Thisis','name_after_named' ], axis=1, inplace=True)
# Verify if there are any invalid names left
invalid_names = archive_clean['Names'].str.extract('(^[a-z]+)', expand=True)
invalid_names.value_counts()
Series([], dtype: int64)
archive_clean['Names'].value_counts().head(60)
997 Lucy 10 Cooper 9 Tucker 8 Charlie 8 Penny 8 Oliver 8 Lola 6 Bella 6 Oscar 6 Bo 6 Louis 5 Scout 5 Buddy 5 Bailey 5 Koda 5 Finn 4 Daisy 4 Derek 4 Chip 4 Jerry 4 Toby 4 Leo 4 Jeffrey 4 Stanley 4 Bentley 4 Sadie 4 Clark 4 Alfie 4 Cassie 4 Dexter 4 Archie 4 Gus 4 Wallace 3 Duke 3 Winnie 3 Milo 3 Mia 3 Walter 3 Ruby 3 Brody 3 Maximus 3 Dave 3 Oakley 3 Riley 3 Carl 3 Kyle 3 Sophie 3 Wyatt 3 Rosie 3 Reginald 3 Phil 3 Bruce 3 Jimothy 3 Bear 3 Waffles 3 Klevin 3 Rusty 3 Winston 3 Chester 3 Name: Names, dtype: int64
list(archive_clean)
['tweet_id', 'tweet_date', 'text', 'expanded_urls', 'rating_numerator', 'rating_denominator', 'dog_stages', 'Names']
# Re-extract the rating numerator from the tweet text; unlike the archive's
# integer column, this pattern also captures decimal ratings such as 9.75.
archive_clean['rating_numerator_from_text'] = archive_clean['text'].str.extract('(\d+\.*\d*)\/\d+', expand=True)
archive_clean['rating_numerator_from_text'].value_counts()
12 486 10 435 11 413 13 287 9 153 8 98 7 52 14 38 5 33 6 32 3 19 4 16 2 9 1 5 11.27 1 9.75 1 88 1 144 1 11.26 1 121 1 44 1 60 1 45 1 80 1 99 1 50 1 204 1 1776 1 165 1 13.5 1 0 1 84 1 24 1 420 1 Name: rating_numerator_from_text, dtype: int64
# Re-extract the rating denominator (the number after the '/') from the text
archive_clean['rating_denominator_from_text'] = archive_clean['text'].str.extract('\d+\.*\d*/(\d+)', expand=True)
archive_clean['rating_denominator_from_text'].value_counts()
10 2079 50 3 11 2 80 2 70 1 7 1 150 1 170 1 20 1 90 1 40 1 110 1 120 1 2 1 Name: rating_denominator_from_text, dtype: int64
# str.extract returns strings — convert both rating parts to floats so they
# can be used in arithmetic below.
archive_clean['rating_numerator_from_text'] = archive_clean['rating_numerator_from_text'].astype('float')
archive_clean['rating_denominator_from_text'] = archive_clean['rating_denominator_from_text'].astype('float')
# Drop tweets that are not dog ratings (identified by visual assessment).
# One vectorised drop replaces nine separate inplace calls — same rows
# removed, clearer intent, single pass.
not_dog_ratings = [2091, 2261, 2338, 315, 1761, 2079, 2237, 2349, 2074]
archive_clean = archive_clean.drop(not_dog_ratings)
# Correct numerators the regex mis-read, after checking the tweet text by eye,
# then display one of the rows to verify the change.
archive_clean.at[2335,'rating_numerator_from_text']=9
archive_clean.at[1202,'rating_numerator_from_text']=11
archive_clean.loc[1202, : ]
tweet_id 716439118184652801 tweet_date 2016-04-03 01:36:11+00:00 text This is Bluebert. He just saw that both #Final... expanded_urls https://twitter.com/dog_rates/status/716439118... rating_numerator 50 rating_denominator 50 dog_stages Names Bluebert rating_numerator_from_text 11.0 rating_denominator_from_text 50.0 Name: 1202, dtype: object
# Logan is a "Gryffindor" dog rated 'Nine and three quarters' (9.75) as a
# Harry Potter joke; the intended rating is 13 — set it manually and verify.
archive_clean.at[695,'rating_numerator_from_text']=13
archive_clean.loc[695, : ]
tweet_id 786709082849828864 tweet_date 2016-10-13 23:23:56+00:00 text This is Logan, the Chow who lived. He solemnly... expanded_urls https://twitter.com/dog_rates/status/786709082... rating_numerator 75 rating_denominator 10 dog_stages Names Logan rating_numerator_from_text 13.0 rating_denominator_from_text 10.0 Name: 695, dtype: object
# "24/7" in a tweet is an expression of time, not a rating — blank out both
# halves (24 as a numerator and 7 as a denominator each occur exactly once).
archive_clean['rating_numerator_from_text'] = archive_clean['rating_numerator_from_text'].replace(24.00, np.nan)
archive_clean['rating_denominator_from_text'] = archive_clean['rating_denominator_from_text'].replace(7, np.nan)
# Scale every rating to a denominator of 10 so group ratings (e.g. 50/50)
# become comparable to single-dog ratings.
archive_clean['Rating_numerator'] = archive_clean['rating_numerator_from_text'] / archive_clean['rating_denominator_from_text'] * 10
# The raw extracted and original rating columns are no longer needed.
archive_clean = archive_clean.drop(columns=['rating_numerator_from_text', 'rating_denominator_from_text', 'rating_numerator', 'rating_denominator'])
archive_clean['Rating_numerator'].value_counts()
12.000000 490 10.000000 436 11.000000 418 13.000000 288 9.000000 153 8.000000 98 7.000000 51 14.000000 38 5.000000 33 6.000000 32 3.000000 19 4.000000 15 2.000000 6 11.270000 1 13.500000 1 1776.000000 1 8.181818 1 2.200000 1 6.363636 1 11.260000 1 1.000000 1 45.000000 1 Name: Rating_numerator, dtype: int64
list(archive_clean)
['tweet_id', 'tweet_date', 'text', 'expanded_urls', 'dog_stages', 'Names', 'Rating_numerator']
# Give the terse p1/p1_conf/p1_dog columns self-describing names; the mapping
# follows the same pattern for all three predictions, so build it in a loop.
prediction_renames = {}
for i in ('1', '2', '3'):
    prediction_renames['p' + i] = 'Image_prediction' + i
    prediction_renames['p' + i + '_conf'] = 'confirmation' + i
    prediction_renames['p' + i + '_dog'] = 'Dog_prediction' + i
Image_predictions_clean = Image_predictions_clean.rename(columns=prediction_renames)
list(Image_predictions_clean)
['tweet_id', 'jpg_url', 'img_num', 'Image_prediction1', 'confirmation1', 'Dog_prediction1', 'Image_prediction2', 'confirmation2', 'Dog_prediction2', 'Image_prediction3', 'confirmation3', 'Dog_prediction3']
# Tidy the predicted breed names: Title Case, spaces instead of underscores.
for col in ('Image_prediction1', 'Image_prediction2', 'Image_prediction3'):
    Image_predictions_clean[col] = Image_predictions_clean[col].str.title().str.replace('_', ' ')
Image_predictions_clean
| tweet_id | jpg_url | img_num | Image_prediction1 | confirmation1 | Dog_prediction1 | Image_prediction2 | confirmation2 | Dog_prediction2 | Image_prediction3 | confirmation3 | Dog_prediction3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 666020888022790149 | https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg | 1 | Welsh Springer Spaniel | 0.465074 | True | Collie | 0.156665 | True | Shetland Sheepdog | 0.061428 | True |
| 1 | 666029285002620928 | https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg | 1 | Redbone | 0.506826 | True | Miniature Pinscher | 0.074192 | True | Rhodesian Ridgeback | 0.072010 | True |
| 2 | 666033412701032449 | https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg | 1 | German Shepherd | 0.596461 | True | Malinois | 0.138584 | True | Bloodhound | 0.116197 | True |
| 3 | 666044226329800704 | https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg | 1 | Rhodesian Ridgeback | 0.408143 | True | Redbone | 0.360687 | True | Miniature Pinscher | 0.222752 | True |
| 4 | 666049248165822465 | https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg | 1 | Miniature Pinscher | 0.560311 | True | Rottweiler | 0.243682 | True | Doberman | 0.154629 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2070 | 891327558926688256 | https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg | 2 | Basset | 0.555712 | True | English Springer | 0.225770 | True | German Short-Haired Pointer | 0.175219 | True |
| 2071 | 891689557279858688 | https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg | 1 | Paper Towel | 0.170278 | False | Labrador Retriever | 0.168086 | True | Spatula | 0.040836 | False |
| 2072 | 891815181378084864 | https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg | 1 | Chihuahua | 0.716012 | True | Malamute | 0.078253 | True | Kelpie | 0.031379 | True |
| 2073 | 892177421306343426 | https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg | 1 | Chihuahua | 0.323581 | True | Pekinese | 0.090647 | True | Papillon | 0.068957 | True |
| 2074 | 892420643555336193 | https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg | 1 | Orange | 0.097049 | False | Bagel | 0.085851 | False | Banana | 0.076110 | False |
2075 rows × 12 columns
# Breed predictions are a repeating set of labels — the category dtype stores
# each distinct label once and saves memory.
for col in ('Image_prediction1', 'Image_prediction2', 'Image_prediction3'):
    Image_predictions_clean[col] = Image_predictions_clean[col].astype('category')
Image_predictions_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2075 entries, 0 to 2074 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2075 non-null object 1 jpg_url 2075 non-null object 2 img_num 2075 non-null int64 3 Image_prediction1 2075 non-null category 4 confirmation1 2075 non-null float64 5 Dog_prediction1 2075 non-null bool 6 Image_prediction2 2075 non-null category 7 confirmation2 2075 non-null float64 8 Dog_prediction2 2075 non-null bool 9 Image_prediction3 2075 non-null category 10 confirmation3 2075 non-null float64 11 Dog_prediction3 2075 non-null bool dtypes: bool(3), category(3), float64(3), int64(1), object(2) memory usage: 165.4+ KB
# Left-join the image predictions and then the API counts onto the cleaned
# archive, keeping every cleaned tweet even when a join partner is missing.
twitter_archive = archive_clean.merge(Image_predictions_clean, on='tweet_id', how='left')
twitter_archive_master = twitter_archive.merge(tweet_json_clean, on='tweet_id', how='left')
twitter_archive_master.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2087 entries, 0 to 2086 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2087 non-null object 1 tweet_date 2087 non-null datetime64[ns, UTC] 2 text 2087 non-null object 3 expanded_urls 2084 non-null object 4 dog_stages 2087 non-null object 5 Names 2087 non-null object 6 Rating_numerator 2086 non-null float64 7 jpg_url 1961 non-null object 8 img_num 1961 non-null float64 9 Image_prediction1 1961 non-null category 10 confirmation1 1961 non-null float64 11 Dog_prediction1 1961 non-null object 12 Image_prediction2 1961 non-null category 13 confirmation2 1961 non-null float64 14 Dog_prediction2 1961 non-null object 15 Image_prediction3 1961 non-null category 16 confirmation3 1961 non-null float64 17 Dog_prediction3 1961 non-null object 18 favorite_count 2087 non-null int64 19 retweet_count 2087 non-null int64 dtypes: category(3), datetime64[ns, UTC](1), float64(5), int64(2), object(9) memory usage: 355.4+ KB
# Persist the combined, cleaned dataset to CSV (index omitted — tweet_id is
# the real identifier).
twitter_archive_master.to_csv('twitter_archive_master.csv',index=False)
from sqlalchemy import create_engine
# Create a SQLAlchemy engine backed by a (new) SQLite database file
engine = create_engine('sqlite:///twitter_archive_master.db')
# Store the cleaned master DataFrame in a table of the same name.
# NOTE(review): to_sql raises if the table already exists — pass
# if_exists='replace' if this cell needs to be re-runnable.
twitter_archive_master.to_sql('twitter_archive_master', engine, index=False)
2087
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Reload the persisted master dataset so the analysis section below can run
# independently of the wrangling cells above.
master_csv = 'twitter_archive_master.csv'
twitter_archive_master = pd.read_csv(master_csv)
twitter_archive_master
| tweet_id | tweet_date | text | expanded_urls | dog_stages | Names | Rating_numerator | jpg_url | img_num | Image_prediction1 | confirmation1 | Dog_prediction1 | Image_prediction2 | confirmation2 | Dog_prediction2 | Image_prediction3 | confirmation3 | Dog_prediction3 | favorite_count | retweet_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 892420643555336193 | 2017-08-01 16:23:56+00:00 | This is Phineas. He's a mystical boy. Only eve... | https://twitter.com/dog_rates/status/892420643... | NaN | Phineas | 13.0 | https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg | 1.0 | Orange | 0.097049 | False | Bagel | 0.085851 | False | Banana | 0.076110 | False | 39467 | 8853 |
| 1 | 892177421306343426 | 2017-08-01 00:17:27+00:00 | This is Tilly. She's just checking pup on you.... | https://twitter.com/dog_rates/status/892177421... | NaN | Tilly | 13.0 | https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg | 1.0 | Chihuahua | 0.323581 | True | Pekinese | 0.090647 | True | Papillon | 0.068957 | True | 33819 | 6514 |
| 2 | 891815181378084864 | 2017-07-31 00:18:03+00:00 | This is Archie. He is a rare Norwegian Pouncin... | https://twitter.com/dog_rates/status/891815181... | NaN | Archie | 12.0 | https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg | 1.0 | Chihuahua | 0.716012 | True | Malamute | 0.078253 | True | Kelpie | 0.031379 | True | 25461 | 4328 |
| 3 | 891689557279858688 | 2017-07-30 15:58:51+00:00 | This is Darla. She commenced a snooze mid meal... | https://twitter.com/dog_rates/status/891689557... | NaN | Darla | 13.0 | https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg | 1.0 | Paper Towel | 0.170278 | False | Labrador Retriever | 0.168086 | True | Spatula | 0.040836 | False | 42908 | 8964 |
| 4 | 891327558926688256 | 2017-07-29 16:00:24+00:00 | This is Franklin. He would like you to stop ca... | https://twitter.com/dog_rates/status/891327558... | NaN | Franklin | 12.0 | https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg | 2.0 | Basset | 0.555712 | True | English Springer | 0.225770 | True | German Short-Haired Pointer | 0.175219 | True | 41048 | 9774 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2082 | 666049248165822465 | 2015-11-16 00:24:50+00:00 | Here we have a 1949 1st generation vulpix. Enj... | https://twitter.com/dog_rates/status/666049248... | NaN | NaN | 5.0 | https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg | 1.0 | Miniature Pinscher | 0.560311 | True | Rottweiler | 0.243682 | True | Doberman | 0.154629 | True | 111 | 41 |
| 2083 | 666044226329800704 | 2015-11-16 00:04:52+00:00 | This is a purebred Piers Morgan. Loves to Netf... | https://twitter.com/dog_rates/status/666044226... | NaN | NaN | 6.0 | https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg | 1.0 | Rhodesian Ridgeback | 0.408143 | True | Redbone | 0.360687 | True | Miniature Pinscher | 0.222752 | True | 311 | 147 |
| 2084 | 666033412701032449 | 2015-11-15 23:21:54+00:00 | Here is a very happy pup. Big fan of well-main... | https://twitter.com/dog_rates/status/666033412... | NaN | NaN | 9.0 | https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg | 1.0 | German Shepherd | 0.596461 | True | Malinois | 0.138584 | True | Bloodhound | 0.116197 | True | 128 | 47 |
| 2085 | 666029285002620928 | 2015-11-15 23:05:30+00:00 | This is a western brown Mitsubishi terrier. Up... | https://twitter.com/dog_rates/status/666029285... | NaN | NaN | 7.0 | https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg | 1.0 | Redbone | 0.506826 | True | Miniature Pinscher | 0.074192 | True | Rhodesian Ridgeback | 0.072010 | True | 132 | 48 |
| 2086 | 666020888022790149 | 2015-11-15 22:32:08+00:00 | Here we have a Japanese Irish Setter. Lost eye... | https://twitter.com/dog_rates/status/666020888... | NaN | NaN | 8.0 | https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg | 1.0 | Welsh Springer Spaniel | 0.465074 | True | Collie | 0.156665 | True | Shetland Sheepdog | 0.061428 | True | 2535 | 532 |
2087 rows × 20 columns
# Pearson correlation between favourites and retweets per tweet.
favorites = twitter_archive_master["favorite_count"]
retweets = twitter_archive_master["retweet_count"]
correlation = favorites.corr(retweets)
correlation
0.9116032698106865
The correlation between favorite_count and retweet_count is approximately 0.9116. Since this value is close to 1, the two variables are strongly positively correlated: tweets that receive more favorites also tend to be retweeted more often.
sns.color_palette()
# Scatter of favourites vs. retweets with a fitted regression line:
# green points, red trend line.
fig1 = sns.regplot(
    x=twitter_archive_master["favorite_count"],
    y=twitter_archive_master["retweet_count"],
    scatter_kws={"color": 'darkgreen'},
    line_kws={"color": "red"},
)
fig1.set(
    xlabel='favorite_count',
    ylabel='retweet_count',
    title="Correlation matrix of favorite count and retweet count data",
);
# Total favourite count per (dog stage, rating) pair, top 10 combinations.
# Select the 'favorite_count' column BEFORE aggregating: calling .sum() on
# the whole grouped frame also tries to add the text columns, which pandas
# >= 2.0 rejects (numeric_only no longer silently drops object columns).
# Series.reset_index() already returns a DataFrame, so no pd.DataFrame wrap.
Top_rating_favorite_stages = (
    twitter_archive_master
    .groupby(['dog_stages', 'Rating_numerator'])['favorite_count']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
Top_rating_favorite_stages
| dog_stages | Rating_numerator | favorite_count | |
|---|---|---|---|
| 0 | doggo | 13.0 | 536789 |
| 1 | pupper | 12.0 | 449907 |
| 2 | pupper | 13.0 | 376697 |
| 3 | puppo | 13.0 | 351106 |
| 4 | pupper | 11.0 | 292288 |
| 5 | doggo | 14.0 | 256874 |
| 6 | doggo | 12.0 | 237380 |
| 7 | doggo | 11.0 | 188781 |
| 8 | pupper | 14.0 | 185428 |
| 9 | pupper | 10.0 | 172021 |
# Bar chart of favourite totals per dog stage, coloured by rating.
fig2 = px.bar(
    Top_rating_favorite_stages,
    x='dog_stages',
    y='favorite_count',
    title="Top favorite dog stages per rating",
    text_auto='.3s',
    color="Rating_numerator",
)
# Order bars by total height; the y axis is redundant given the bar labels.
fig2.update_layout(
    xaxis={'categoryorder': 'total descending'},
    yaxis={'visible': False, 'showticklabels': False},
    xaxis_title=None,
)
fig2.update_traces(textfont_size=15, textangle=0, cliponaxis=False)
# Total retweet count per (name, rating, dog stage) triple, top 10.
# Select the 'retweet_count' column BEFORE aggregating: summing the whole
# grouped frame also tries to add the text columns, which pandas >= 2.0
# rejects (numeric_only no longer silently drops object columns).
# Series.reset_index() already returns a DataFrame, so no pd.DataFrame wrap.
Top_rating_retweeted_Names = (
    twitter_archive_master
    .groupby(['Names', 'Rating_numerator', 'dog_stages'])['retweet_count']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
Top_rating_retweeted_Names
| Names | Rating_numerator | dog_stages | retweet_count | |
|---|---|---|---|---|
| 0 | Bo | 14.0 | doggo | 42228 |
| 1 | Jamesy | 13.0 | pupper | 32883 |
| 2 | Sunny | 14.0 | doggo | 21794 |
| 3 | Gabe | 14.0 | pupper | 17209 |
| 4 | Barney | 13.0 | doggo | 13076 |
| 5 | Cupid | 13.0 | doggo | 11878 |
| 6 | Sebastian | 13.0 | puppo | 11007 |
| 7 | Astrid | 13.0 | doggo | 10706 |
| 8 | Reginald | 12.0 | puppo | 9374 |
| 9 | Cassie | 14.0 | doggo | 7711 |
# Bar chart of retweet totals per dog name, coloured by rating.
fig3 = px.bar(
    Top_rating_retweeted_Names,
    x='Names',
    y='retweet_count',
    title="Top retweeted Names per rating",
    text_auto='.3s',
    color="Rating_numerator",
)
fig3.update_layout(
    # xaxis={'categoryorder': 'total descending'},  # kept disabled, as before
    yaxis={'visible': False, 'showticklabels': False},
    xaxis_title=None,
)
fig3.update_traces(
    textfont_size=15,
    textangle=0,
    textposition="outside",
    cliponaxis=False,
)
# Inspect the top-retweeted dog: Bo, rated 14/10.
is_bo = twitter_archive_master['Names'] == "Bo"
is_top_rated = twitter_archive_master['Rating_numerator'] == 14
Bo = twitter_archive_master[is_bo & is_top_rated]
Bo
| tweet_id | tweet_date | text | expanded_urls | dog_stages | Names | Rating_numerator | jpg_url | img_num | Image_prediction1 | confirmation1 | Dog_prediction1 | Image_prediction2 | confirmation2 | Dog_prediction2 | Image_prediction3 | confirmation3 | Dog_prediction3 | favorite_count | retweet_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 349 | 819004803107983360 | 2017-01-11 02:15:36+00:00 | This is Bo. He was a very good First Doggo. 14... | https://twitter.com/dog_rates/status/819004803... | doggo | Bo | 14.0 | https://pbs.twimg.com/media/C12whDoVEAALRxa.jpg | 1.0 | Standard Poodle | 0.351308 | True | Toy Poodle | 0.271929 | True | Tibetan Terrier | 0.094759 | True | 95450 | 42228 |
from IPython.display import Image

# Show Bo's photo inline, scaled down to 500x300.
bo_photo_url = "https://pbs.twimg.com/media/C12whDoVEAALRxa.jpg"
Image(url=bo_photo_url, width=500, height=300)